## Load the Wisconsin nursing-home survey and drop incomplete records
datafile <- na.omit(read.csv("../WiscNursingHome.csv"))

## dividing data from different years


## Encode organisational structure and MSA codes as categorical variables
datafile$ORGSTR <- factor(datafile$ORGSTR)
datafile$MSA <- factor(datafile$MSA)
## Occupancy measure: total patient-years relative to the number of beds
datafile$UTILIZATION_RATE <- datafile$TPY / datafile$NUMBED
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Tally how many survey rows each facility (hospID) contributes
hospID_counts <- datafile %>%
  count(hospID, name = "count")

# Facilities observed in more than one survey year
to_keep <- hospID_counts$hospID[hospID_counts$count > 1]

# Restrict the dataset to those repeated facilities
datafile_filtered <- datafile %>% filter(hospID %in% to_keep)
library("dplyr")
## One row per facility, pairing the 2000 and 2001 values of each variable.
## Built from datafile_filtered (not datafile) so that every hospID has
## exactly one row per year; with the unfiltered data a facility observed in
## only one year would yield a zero-length subset inside summarise().
## The duplicated NUMBED_2000/NUMBED_2001 definitions of the original have
## been removed (the second pair silently overwrote the first).
years_comparison <- datafile_filtered %>%
  group_by(hospID) %>%
  summarise(SQRFOOT_2000 =  SQRFOOT[CRYEAR == 2000], SQRFOOT_2001 =  SQRFOOT[CRYEAR == 2001],
            NUMBED_2000 = NUMBED[CRYEAR == 2000], NUMBED_2001 = NUMBED[CRYEAR == 2001],
            TPY_2000 =  TPY[CRYEAR == 2000], TPY_2001 =  TPY[CRYEAR == 2001],
            MSA_2000 =  MSA[CRYEAR == 2000], MSA_2001 =  MSA[CRYEAR == 2001],
            URBAN_2000 = URBAN[CRYEAR == 2000], URBAN_2001 = URBAN[CRYEAR == 2001],
            TAXEXEMPT_2000 =  TAXEXEMPT[CRYEAR == 2000], TAXEXEMPT_2001 =  TAXEXEMPT[CRYEAR == 2001],
            PRO_2000 = PRO[CRYEAR == 2000], PRO_2001 = PRO[CRYEAR == 2001],
            MCERT_2000 =  MCERT[CRYEAR == 2000], MCERT_2001 =  MCERT[CRYEAR == 2001],
            SELFFUNDINS_2000 = SELFFUNDINS[CRYEAR == 2000], SELFFUNDINS_2001 = SELFFUNDINS[CRYEAR == 2001],
            ORGSTR_2000 =  ORGSTR[CRYEAR == 2000], ORGSTR_2001 =  ORGSTR[CRYEAR == 2001],
            UTILIZATION_RATE_2000 = UTILIZATION_RATE[CRYEAR == 2000],
            UTILIZATION_RATE_2001 = UTILIZATION_RATE[CRYEAR == 2001])
years_comparison
## # A tibble: 346 × 23
## # Groups:   hospID [346]
##    hospID SQRFOOT_2000 SQRFOOT…¹ NUMBE…² NUMBE…³ TPY_2…⁴ TPY_2…⁵ MSA_2…⁶ MSA_2…⁷
##     <int>        <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <fct>   <fct>  
##  1    101         10.9      10.9      18      18    16.5    16.7 0       0      
##  2    103         19.8      19.8      63      50    59.2    45.6 0       0      
##  3    105         26.9      26.9      54      54    49.6    52.4 1       1      
##  4    107         26.3      26.3      60      60    51.9    57.4 0       0      
##  5    108         30.7      30.7     104     104    94.6    95.6 10      10     
##  6    109         24.3      24.3      79      79    69.7    70.2 11      11     
##  7    110         34.4      34.4     105     105    95.6    97.7 11      11     
##  8    111         52.1      46.0     129     108    95.2    98.4 6       6      
##  9    112         71.8      71.8     200     200   193.    183.  10      10     
## 10    113         63.4      63.4     184     184   156.    153.  6       6      
## # … with 336 more rows, 14 more variables: URBAN_2000 <int>, URBAN_2001 <int>,
## #   TAXEXEMPT_2000 <int>, TAXEXEMPT_2001 <int>, PRO_2000 <int>, PRO_2001 <int>,
## #   MCERT_2000 <int>, MCERT_2001 <int>, SELFFUNDINS_2000 <int>,
## #   SELFFUNDINS_2001 <int>, ORGSTR_2000 <fct>, ORGSTR_2001 <fct>,
## #   UTILIZATION_RATE_2000 <dbl>, UTILIZATION_RATE_2001 <dbl>, and abbreviated
## #   variable names ¹​SQRFOOT_2001, ²​NUMBED_2000, ³​NUMBED_2001, ⁴​TPY_2000,
## #   ⁵​TPY_2001, ⁶​MSA_2000, ⁷​MSA_2001
## Split the filtered data by survey year
data_2000 <- datafile_filtered[datafile_filtered$CRYEAR == 2000, ]
data_2001 <- datafile_filtered[datafile_filtered$CRYEAR == 2001, ]
## Attach each facility's year-2000 utilization rate to its 2001 row.
## Match explicitly on hospID instead of relying on the two subsets having
## identical row order (positional assignment would silently misalign
## facilities if the order ever differed).
data_2001$PREV_UTILIZATION_RATE <-
  data_2000$UTILIZATION_RATE[match(data_2001$hospID, data_2000$hospID)]
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

For the moment we just work on the data coming from the 2001 survey (augmented with the previous year’s utilization rate), to avoid dependency between observations.

Quantitative variables only

First we consider the quantitative variables only.

Let’s see the correlation:

# Pairwise scatterplots and correlations for the quantitative variables
ggpairs(subset(data_2001, select = c(SQRFOOT, NUMBED, TPY, PREV_UTILIZATION_RATE)))

# Linear correlation between log(TPY) and the previous year's utilization rate
cor(log(data_2001$TPY), data_2001$PREV_UTILIZATION_RATE)
## [1] 0.2124294

We can see the strong correlation between all the three variables. In particular we see the almost perfectly linear correlation between TPY and NUMBED.

Simple linear models

Being NUMBED the most linearly correlated variable w.r.t. TPY, we start modelling TPY using NUMBED and subsequently add SQRFOOT and the interaction between the two:

# Linear model of TPY on the lagged utilization rate and square footage
summary(lm(TPY ~ PREV_UTILIZATION_RATE + SQRFOOT, data = data_2001))
## 
## Call:
## lm(formula = TPY ~ PREV_UTILIZATION_RATE + SQRFOOT, data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.081  -15.096   -3.165   15.082   90.070 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.1600    14.7886   0.078    0.938    
## PREV_UTILIZATION_RATE  38.2421    16.2273   2.357    0.019 *  
## SQRFOOT                 1.0621     0.0419  25.348   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.37 on 343 degrees of freedom
## Multiple R-squared:  0.6605, Adjusted R-squared:  0.6585 
## F-statistic: 333.6 on 2 and 343 DF,  p-value: < 2.2e-16

The t test related to SQRFOOT does not give enough evidence against the null hypothesis. To understand better the role of the variable we can perform the analysis of variance:

# Sequential analysis of deviance: terms are added first-to-last, so each row
# tests the contribution of that term given the ones above it
anova(glm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001, family = gaussian),
      test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: TPY
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             345     702732              
## NUMBED          1   683048       344      19684 < 2.2e-16 ***
## SQRFOOT         1      941       343      18743 3.398e-05 ***
## NUMBED:SQRFOOT  1        4       342      18739    0.7785    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As we can see, despite being NUMBED and SQRFOOT highly correlated, it is worth including the second variable to the model too. Adding the interaction, instead, seems not to give any important information to the model.

We can also use the Akaike information criterion to check this result (here we compare some of the possible combinations):

# Compare candidate models by AIC (lower is better): all combinations of
# NUMBED, SQRFOOT and their interaction
AIC(lm(TPY ~ NUMBED, data = data_2001),
    lm(TPY ~ SQRFOOT, data = data_2001),
    lm(TPY ~ NUMBED + SQRFOOT, data = data_2001),
    lm(TPY ~ NUMBED:SQRFOOT, data = data_2001),
    lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001),
    lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001),
    lm(TPY ~ SQRFOOT + NUMBED:SQRFOOT, data = data_2001))
##                                                               df      AIC
## lm(TPY ~ NUMBED, data = data_2001)                             3 2386.140
## lm(TPY ~ SQRFOOT, data = data_2001)                            3 3254.947
## lm(TPY ~ NUMBED + SQRFOOT, data = data_2001)                   4 2371.185
## lm(TPY ~ NUMBED:SQRFOOT, data = data_2001)                     3 3149.517
## lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001)  5 2373.105
## lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001)            4 2379.298
## lm(TPY ~ SQRFOOT + NUMBED:SQRFOOT, data = data_2001)           4 3150.156

Again the analysis indicates NUMBED as the most relevant variable and the interaction between NUMBED and SQRFOOT as basically non relevant.

Models with log-transformed predictor

We can try to improve the model by log-transforming the predictors:

# Same AIC comparison with log-transformed predictors (response untouched)
AIC(lm(TPY ~ log(NUMBED), data = data_2001),
    lm(TPY ~ log(SQRFOOT), data = data_2001),
    lm(TPY ~ log(NUMBED) + log(SQRFOOT), data = data_2001),
    lm(TPY ~ log(NUMBED) + SQRFOOT, data = data_2001),
    lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001))
##                                                        df      AIC
## lm(TPY ~ log(NUMBED), data = data_2001)                 3 2931.458
## lm(TPY ~ log(SQRFOOT), data = data_2001)                3 3283.465
## lm(TPY ~ log(NUMBED) + log(SQRFOOT), data = data_2001)  4 2926.603
## lm(TPY ~ log(NUMBED) + SQRFOOT, data = data_2001)       4 2825.379
## lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001)       4 2378.719

We cannot see any improvement.

Graphical analysis

At the moment, the best model (according to AIC) seems to be the one with just the single linear contributions of the two variables. Let’s analyze the residuals:

# Best model so far by AIC: additive linear effects of NUMBED and SQRFOOT
fit.linear <- lm(TPY ~ NUMBED + SQRFOOT, data = data_2001)
summary(fit.linear)
## 
## Call:
## lm(formula = TPY ~ NUMBED + SQRFOOT, data = data_2001)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.816  -2.385   0.837   3.514  32.744 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.20766    0.89904  -0.231    0.817    
## NUMBED       0.88207    0.01379  63.985  < 2e-16 ***
## SQRFOOT      0.08055    0.01941   4.151 4.19e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.392 on 343 degrees of freedom
## Multiple R-squared:  0.9733, Adjusted R-squared:  0.9732 
## F-statistic:  6259 on 2 and 343 DF,  p-value: < 2.2e-16
# Standard 2x2 diagnostic panel: residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage
par(mfrow = c(2, 2))
plot(fit.linear)

The residuals are not exactly as we would expect from a good linear model. In particular there seem to be a couple of outliers, the residuals are not normally distributed on the edges and homoscedasticity is not satisfied.

Let’s also try to inspect the residuals’ plots for the other good models (according to AIC) we got previously:

# Residual diagnostics for the other competitive models from the AIC table
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED, data = data_2001))

par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + SQRFOOT + NUMBED:SQRFOOT, data = data_2001))

par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + NUMBED:SQRFOOT, data = data_2001))

par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + log(SQRFOOT), data = data_2001))

None of the models above seems to satisfy the assumptions on the linear model.

Let’s now look at the distribution of the data:

# Marginal distributions of response and predictors (all strongly skewed)
par(mfrow = c(2, 2))
hist(data_2001$TPY, xlab = "TPY", freq = TRUE)
hist(data_2001$NUMBED, xlab = "NUMBED", freq = TRUE)
hist(data_2001$SQRFOOT, xlab = "SQRFOOT", freq = TRUE)

As we can see all of the variables are strongly skewed.

Models with log-transformed TPY

First let’s see the correlation:

## TODO: run ggpairs() on the log-transformed variables

We see that also TPY is strongly skewed, therefore we can try to model its log-transform:

# Log-log model with interaction between the two log-predictors
summary(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT), data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT), 
##     data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87833 -0.02192  0.01586  0.05139  0.27106 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -0.170946   0.204398  -0.836    0.404    
## log(NUMBED)               0.996138   0.048397  20.583   <2e-16 ***
## log(SQRFOOT)              0.032319   0.057250   0.565    0.573    
## log(NUMBED):log(SQRFOOT) -0.001251   0.012330  -0.101    0.919    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09381 on 342 degrees of freedom
## Multiple R-squared:  0.9651, Adjusted R-squared:  0.9648 
## F-statistic:  3155 on 3 and 342 DF,  p-value: < 2.2e-16
# Sequential analysis of deviance for the log-log model
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(NUMBED):log(SQRFOOT), data = data_2001,
          family = gaussian), test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: log(TPY)
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                                       345     86.308             
## log(NUMBED)               1   83.268       344      3.040  < 2e-16 ***
## log(SQRFOOT)              1    0.030       343      3.010  0.06415 .  
## log(NUMBED):log(SQRFOOT)  1    0.000       342      3.010  0.91920    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

In this case even the main effect of SQRFOOT seems to be non relevant. Let’s give a look at the plot of the model with just log(NUMBED) and with just log(SQRFOOT):

# Single-predictor log-log model: log(NUMBED) only
summary(lm(log(TPY) ~ log(NUMBED), data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ log(NUMBED), data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.87488 -0.02220  0.01507  0.05291  0.28856 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.17468    0.04706  -3.711  0.00024 ***
## log(NUMBED)  1.01924    0.01050  97.071  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.094 on 344 degrees of freedom
## Multiple R-squared:  0.9648, Adjusted R-squared:  0.9647 
## F-statistic:  9423 on 1 and 344 DF,  p-value: < 2.2e-16
# Single-predictor log-log model: log(SQRFOOT) only
summary(lm(log(TPY) ~ log(SQRFOOT), data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ log(SQRFOOT), data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.67815 -0.16207  0.02277  0.18601  1.05067 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.88032    0.09667   19.45   <2e-16 ***
## log(SQRFOOT)  0.66841    0.02564   26.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2904 on 344 degrees of freedom
## Multiple R-squared:  0.6639, Adjusted R-squared:  0.663 
## F-statistic: 679.6 on 1 and 344 DF,  p-value: < 2.2e-16

And to their residuals:

# NOTE(review): the prose announces residuals of the two single-predictor
# models, but the first plot below also includes log(SQRFOOT) and
# log(PREV_UTILIZATION_RATE) — confirm which model was intended.
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + log(PREV_UTILIZATION_RATE), data = data_2001))

par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(SQRFOOT), data = data_2001))

We look at the AIC too:

# AIC comparison of the log-transformed-response models
AIC(lm(log(TPY) ~ log(NUMBED), data = data_2001),
    lm(log(TPY) ~ log(SQRFOOT), data = data_2001),
    lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT), data = data_2001))
##                                                             df       AIC
## lm(log(TPY) ~ log(NUMBED), data = data_2001)                 3 -650.2733
## lm(log(TPY) ~ log(SQRFOOT), data = data_2001)                3  130.1727
## lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT), data = data_2001)  4 -651.7229

Again NUMBED seems to be a good predictor, but the residuals suggest that the assumption on the linear model are not satisfied.

On the other hand SQRFOOT seems to be less relevant but with much better residuals (apart from the high-leverage outlier 331).

We can also try a kind of mixed model:

# Mixed transformation: raw NUMBED together with log(SQRFOOT)
summary(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20596 -0.06471  0.04114  0.10897  0.37224 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.8948843  0.0747232  38.741  < 2e-16 ***
## NUMBED       0.0076163  0.0003292  23.135  < 2e-16 ***
## log(SQRFOOT) 0.1982214  0.0258945   7.655 1.98e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1817 on 343 degrees of freedom
## Multiple R-squared:  0.8688, Adjusted R-squared:  0.868 
## F-statistic:  1135 on 2 and 343 DF,  p-value: < 2.2e-16
# Diagnostics and AIC for the mixed-transformation model
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))

AIC(lm(log(TPY) ~ NUMBED + log(SQRFOOT), data = data_2001))
## [1] -193.1341

But the results are not encouraging.

Models with inverse- and sqrt-transformed TPY

To fix residuals we can also try:

# Inverse-transformed response (I() protects 1/TPY from formula arithmetic)
summary(lm(I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001))
## 
## Call:
## lm(formula = I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.005553 -0.002945 -0.001930  0.000552  0.056076 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.708e-02  7.253e-04  37.338   <2e-16 ***
## NUMBED      -1.403e-04  1.112e-05 -12.616   <2e-16 ***
## SQRFOOT      1.858e-05  1.566e-05   1.187    0.236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.005964 on 343 degrees of freedom
## Multiple R-squared:  0.5237, Adjusted R-squared:  0.5209 
## F-statistic: 188.6 on 2 and 343 DF,  p-value: < 2.2e-16
# Square-root-transformed response
summary(lm(I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001))
## 
## Call:
## lm(formula = I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6447 -0.2063  0.1194  0.3320  1.8444 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.746642   0.068224   69.58   <2e-16 ***
## NUMBED      0.044967   0.001046   42.98   <2e-16 ***
## SQRFOOT     0.001384   0.001473    0.94    0.348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.561 on 343 degrees of freedom
## Multiple R-squared:  0.939,  Adjusted R-squared:  0.9386 
## F-statistic:  2640 on 2 and 343 DF,  p-value: < 2.2e-16
# Residual diagnostics for the inverse- and sqrt-transformed models
par(mfrow = c(2, 2))
plot(lm(I(1/TPY) ~ NUMBED + SQRFOOT, data = data_2001))

par(mfrow = c(2, 2))
plot(lm(I(sqrt(TPY)) ~ NUMBED + SQRFOOT, data = data_2001))

The results are even worse than before.

Adding categorical variables

By using the numerical variables only we were not able to obtain a good model. In particular it was not possible to build a model in which all the characteristic assumptions for linear models were met. We can then try to improve our model by adding the categorical predictors.

First of all we can observe two things about the categorical variables:

We can also visualize the previous two observations by graphs:

### TODO: insert Simone's plots/analysis here

For the moment let’s exclude the variable MCERT and use the most “generic” variables. As a baseline we take the linear model with the higher AIC score.

# Baseline model extended with the binary categorical predictors
summary(lm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO, data = data_2001))
## 
## Call:
## lm(formula = TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + 
##     URBAN + PRO, data = data_2001)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.033  -2.389   0.887   3.678  32.453 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.30487    1.65318  -0.184  0.85380    
## NUMBED       0.88749    0.01434  61.888  < 2e-16 ***
## SQRFOOT      0.07476    0.02110   3.544  0.00045 ***
## TAXEXEMPT    0.87293    1.40626   0.621  0.53519    
## SELFFUNDINS  0.39001    0.83297   0.468  0.63993    
## URBAN       -1.16575    0.83217  -1.401  0.16217    
## PRO         -0.17355    1.44099  -0.120  0.90421    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.393 on 339 degrees of freedom
## Multiple R-squared:  0.9736, Adjusted R-squared:  0.9732 
## F-statistic:  2086 on 6 and 339 DF,  p-value: < 2.2e-16
# Sequential analysis of deviance for the extended model (order matters)
anova(glm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO, data = data_2001,
          family = gaussian), test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: TPY
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          345     702732              
## NUMBED       1   683048       344      19684 < 2.2e-16 ***
## SQRFOOT      1      941       343      18743 3.327e-05 ***
## TAXEXEMPT    1       86       342      18657    0.2105    
## SELFFUNDINS  1       14       341      18643    0.6145    
## URBAN        1      112       340      18531    0.1522    
## PRO          1        1       339      18531    0.9041    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual diagnostics for the extended model
par(mfrow = c(2, 2))
plot(lm(TPY ~ NUMBED + SQRFOOT + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
        data = data_2001))

Let’s now try with the log-transformed variables:

# Log-log model extended with the categorical predictors
summary(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
           data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + 
##     SELFFUNDINS + URBAN + PRO, data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86398 -0.02304  0.01933  0.04979  0.27216 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.170131   0.053410  -3.185  0.00158 ** 
## log(NUMBED)   1.002178   0.018924  52.957  < 2e-16 ***
## log(SQRFOOT)  0.019028   0.015588   1.221  0.22307    
## TAXEXEMPT     0.019875   0.017663   1.125  0.26127    
## SELFFUNDINS   0.003665   0.010536   0.348  0.72815    
## URBAN        -0.015259   0.010433  -1.463  0.14452    
## PRO          -0.001832   0.017927  -0.102  0.91868    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09333 on 339 degrees of freedom
## Multiple R-squared:  0.9658, Adjusted R-squared:  0.9652 
## F-statistic:  1595 on 6 and 339 DF,  p-value: < 2.2e-16
# Sequential analysis of deviance for the log-log extended model
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
          data = data_2001, family = gaussian), test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: log(TPY)
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                           345     86.308             
## log(NUMBED)   1   83.268       344      3.040  < 2e-16 ***
## log(SQRFOOT)  1    0.030       343      3.010  0.06278 .  
## TAXEXEMPT     1    0.036       342      2.973  0.04070 *  
## SELFFUNDINS   1    0.001       341      2.972  0.69959    
## URBAN         1    0.019       340      2.953  0.13655    
## PRO           1    0.000       339      2.953  0.91862    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual diagnostics for the log-log extended model
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + TAXEXEMPT + SELFFUNDINS + URBAN + PRO,
        data = data_2001))

Better, but still the outliers are a problem.

Notice that TAXEXEMPT seems to be the only relevant categorical variable. let’s try to put it as last to check:

# Same model with TAXEXEMPT added last: in a sequential ANOVA this tests its
# contribution after all other terms are already in the model
anova(glm(log(TPY) ~ log(NUMBED) + log(SQRFOOT) + SELFFUNDINS + URBAN + PRO + TAXEXEMPT,
          data = data_2001, family = gaussian), test = "Chisq")
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: log(TPY)
## 
## Terms added sequentially (first to last)
## 
## 
##              Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
## NULL                           345     86.308             
## log(NUMBED)   1   83.268       344      3.040  < 2e-16 ***
## log(SQRFOOT)  1    0.030       343      3.010  0.06278 .  
## SELFFUNDINS   1    0.001       342      3.009  0.72424    
## URBAN         1    0.023       341      2.986  0.10757    
## PRO           1    0.023       340      2.964  0.10796    
## TAXEXEMPT     1    0.011       339      2.953  0.26047    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The only categorical variables that seem to be significant are then PRO and TAXEXEMPT.

Best model

In the end the only acceptable model (that is: the only model whose residuals’ plots confirm that the linear models’ hypotheses are met) seems to be the following.

# Final candidate model: log(SQRFOOT), TAXEXEMPT and the lagged utilization
# rate.  The par() call here sets up the 2x2 layout for the plot() below.
par(mfrow=c(2,2))
fit.best <- lm(log(TPY) ~ log(SQRFOOT) + TAXEXEMPT + PREV_UTILIZATION_RATE, data = data_2001)
summary(fit.best)
## 
## Call:
## lm(formula = log(TPY) ~ log(SQRFOOT) + TAXEXEMPT + PREV_UTILIZATION_RATE, 
##     data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34987 -0.14881  0.00171  0.17431  1.01927 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.10304    0.17397   6.340 7.25e-10 ***
## log(SQRFOOT)           0.67531    0.02473  27.306  < 2e-16 ***
## TAXEXEMPT             -0.14240    0.03144  -4.529 8.21e-06 ***
## PREV_UTILIZATION_RATE  0.88314    0.17138   5.153 4.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2753 on 342 degrees of freedom
## Multiple R-squared:  0.6997, Adjusted R-squared:  0.6971 
## F-statistic: 265.7 on 3 and 342 DF,  p-value: < 2.2e-16
# Diagnostic plots for the final model
plot(fit.best)

# Note: for an lm object anova() ignores test = "Chisq" and reports F tests
anova(fit.best, test = "Chisq")
## Analysis of Variance Table
## 
## Response: log(TPY)
##                        Df Sum Sq Mean Sq F value    Pr(>F)    
## log(SQRFOOT)            1 57.304  57.304 756.266 < 2.2e-16 ***
## TAXEXEMPT               1  1.078   1.078  14.229 0.0001907 ***
## PREV_UTILIZATION_RATE   1  2.012   2.012  26.554  4.34e-07 ***
## Residuals             342 25.914   0.076                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# NOTE(review): this repeats the plot(fit.best) already produced above —
# likely a leftover; confirm whether both panels are wanted
par(mfrow = c(2, 2))
plot(fit.best)

Using 2000 to filter 2001

Here we try to use 2000’s UTILIZATION_RATE to filter out outliers from 2001’s data.

First we plot the model for all 2001 data:

# Reference model fitted on all 2001 observations, before truncation
summary(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001))
## 
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86517 -0.02326  0.01758  0.04951  0.29827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.18386    0.04685  -3.924 0.000105 ***
## log(NUMBED)  1.01913    0.01042  97.799  < 2e-16 ***
## TAXEXEMPT    0.02594    0.01037   2.501 0.012837 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09329 on 343 degrees of freedom
## Multiple R-squared:  0.9654, Adjusted R-squared:  0.9652 
## F-statistic:  4787 on 2 and 343 DF,  p-value: < 2.2e-16
# Residual diagnostics for the untruncated 2001 model
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001))

# Keep only 2001 facilities whose year-2000 utilization rate lies in
# (0.85, 1.07).  NOTE(review): these bounds look hand-chosen from the 2000
# analysis — confirm how they were derived.
data_2001_trunc <- data_2001[data_2001$PREV_UTILIZATION_RATE > 0.85 &
                               data_2001$PREV_UTILIZATION_RATE < 1.07, ]
summary(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc))
## 
## Call:
## lm(formula = log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.218415 -0.024751  0.007205  0.033057  0.282089 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.087348   0.029536  -2.957 0.003352 ** 
## log(NUMBED)  1.002350   0.006562 152.750  < 2e-16 ***
## TAXEXEMPT    0.021116   0.006317   3.343 0.000936 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05327 on 297 degrees of freedom
## Multiple R-squared:  0.9875, Adjusted R-squared:  0.9874 
## F-statistic: 1.169e+04 on 2 and 297 DF,  p-value: < 2.2e-16
# Residual diagnostics for the model on the truncated 2001 data
par(mfrow = c(2, 2))
plot(lm(log(TPY) ~ log(NUMBED) + TAXEXEMPT, data = data_2001_trunc))

We will use the interval found for 2000 to improve the model: